import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("student-mat.csv", sep = ';') # student math score file
data.head()
| school | sex | age | address | famsize | Pstatus | Medu | Fedu | Mjob | Fjob | ... | famrel | freetime | goout | Dalc | Walc | health | absences | G1 | G2 | G3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GP | F | 18 | U | GT3 | A | 4 | 4 | at_home | teacher | ... | 4 | 3 | 4 | 1 | 1 | 3 | 6 | 5 | 6 | 6 |
| 1 | GP | F | 17 | U | GT3 | T | 1 | 1 | at_home | other | ... | 5 | 3 | 3 | 1 | 1 | 3 | 4 | 5 | 5 | 6 |
| 2 | GP | F | 15 | U | LE3 | T | 1 | 1 | at_home | other | ... | 4 | 3 | 2 | 2 | 3 | 3 | 10 | 7 | 8 | 10 |
| 3 | GP | F | 15 | U | GT3 | T | 4 | 2 | health | services | ... | 3 | 2 | 2 | 1 | 1 | 5 | 2 | 15 | 14 | 15 |
| 4 | GP | F | 16 | U | GT3 | T | 3 | 3 | other | other | ... | 4 | 3 | 2 | 1 | 2 | 5 | 4 | 6 | 10 | 10 |
5 rows × 33 columns
data.shape
(395, 33)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 395 entries, 0 to 394 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 school 395 non-null object 1 sex 395 non-null object 2 age 395 non-null int64 3 address 395 non-null object 4 famsize 395 non-null object 5 Pstatus 395 non-null object 6 Medu 395 non-null int64 7 Fedu 395 non-null int64 8 Mjob 395 non-null object 9 Fjob 395 non-null object 10 reason 395 non-null object 11 guardian 395 non-null object 12 traveltime 395 non-null int64 13 studytime 395 non-null int64 14 failures 395 non-null int64 15 schoolsup 395 non-null object 16 famsup 395 non-null object 17 paid 395 non-null object 18 activities 395 non-null object 19 nursery 395 non-null object 20 higher 395 non-null object 21 internet 395 non-null object 22 romantic 395 non-null object 23 famrel 395 non-null int64 24 freetime 395 non-null int64 25 goout 395 non-null int64 26 Dalc 395 non-null int64 27 Walc 395 non-null int64 28 health 395 non-null int64 29 absences 395 non-null int64 30 G1 395 non-null int64 31 G2 395 non-null int64 32 G3 395 non-null int64 dtypes: int64(16), object(17) memory usage: 102.0+ KB
The following are the explanations for the column names:
Demographic Information:
school: School name. ('GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)sex: Student's gender. ('F' - female or 'M' - male)age: Student's age. address: Type of address ('U' - urban or 'R' - rural).famsize: Family size. ('LE3' - less or equal to 3 or 'GT3' - greater than 3)Pstatus: Parent's cohabitation status. ('T' - living together or 'A' - apart)Educational Background:
Medu: Mother's education level.Fedu: Father's education level.Mjob: Mother's job.Fjob: Father's job.Reasons for Choosing School:
reason: Reason for choosing the school.Guardian Information:
guardian: Student's guardian.Time Factors:
traveltime: Travel time to school from home.studytime: Weekly study time.Academic Performance:
failures: Number of past class failures.schoolsup: Extra educational support from the school.famsup: Family educational support.paid: Extra paid classes within the course subject (Math or Portuguese).activities: Extra-curricular activities.nursery: Attended at nursery school.higher: Plans to pursue higher education.internet: Internet access at home.romantic: In a romantic relationship.Social and Personal Life:
famrel: Quality of family relationships.freetime: Free time after school.goout: Going out with friends.Dalc: Workday alcohol consumption.Walc: Weekend alcohol consumption.health: Current health status.Attendance and Grades:
absences: Number of school absences.G1, G2, G3: Grades in three different periods.
G1 - first period grade (numeric: from 0 to 20) - Maths
G2 - second period grade (numeric: from 0 to 20) - PortugueseG3 - final grade (numeric: from 0 to 20, output target) - combination of first two subjectscolumn_names = {
'school': 'School',
'sex': 'Gender',
'age': 'Age',
'address': 'Address',
'famsize': 'Family_Size',
'Pstatus': 'Parent_Cohabitation_Status',
'Medu': 'Mother_Education',
'Fedu': 'Father_Education',
'Mjob': 'Mother_Job',
'Fjob': 'Father_Job',
'reason': 'Reason_for_Choosing_School',
'guardian': 'Guardian',
'traveltime': 'Travel_Time',
'studytime': 'Study_Time',
'failures': 'Past_Failures',
'schoolsup': 'School_Support',
'famsup': 'Family_Support',
'paid': 'Paid_Classes',
'activities': 'Extra_Curricular_Activities',
'nursery': 'Nursery_Attendance',
'higher': 'Plans_for_Higher_Education',
'internet': 'Internet_Access',
'romantic': 'In_Relationship',
'famrel': 'Family_Relationship_Quality',
'freetime': 'Free_Time',
'goout': 'Go_Out_with_Friends',
'Dalc': 'Workday_Alcohol_Consumption',
'Walc': 'Weekend_Alcohol_Consumption',
'health': 'Health_Status',
'absences': 'Absences',
'G1': 'Grade_Period_1',
'G2': 'Grade_Period_2',
'G3': 'Grade_Period_3'
}
data.rename(columns=column_names, inplace=True)
data.columns = data.columns.str.lower()
data.columns
Index(['school', 'gender', 'age', 'address', 'family_size',
'parent_cohabitation_status', 'mother_education', 'father_education',
'mother_job', 'father_job', 'reason_for_choosing_school', 'guardian',
'travel_time', 'study_time', 'past_failures', 'school_support',
'family_support', 'paid_classes', 'extra_curricular_activities',
'nursery_attendance', 'plans_for_higher_education', 'internet_access',
'in_relationship', 'family_relationship_quality', 'free_time',
'go_out_with_friends', 'workday_alcohol_consumption',
'weekend_alcohol_consumption', 'health_status', 'absences',
'grade_period_1', 'grade_period_2', 'grade_period_3'],
dtype='object')
data.isnull().sum()
school 0 gender 0 age 0 address 0 family_size 0 parent_cohabitation_status 0 mother_education 0 father_education 0 mother_job 0 father_job 0 reason_for_choosing_school 0 guardian 0 travel_time 0 study_time 0 past_failures 0 school_support 0 family_support 0 paid_classes 0 extra_curricular_activities 0 nursery_attendance 0 plans_for_higher_education 0 internet_access 0 in_relationship 0 family_relationship_quality 0 free_time 0 go_out_with_friends 0 workday_alcohol_consumption 0 weekend_alcohol_consumption 0 health_status 0 absences 0 grade_period_1 0 grade_period_2 0 grade_period_3 0 dtype: int64
# checking for duplicates
duplicate_rows = data.duplicated()
total_duplicates = data.duplicated().sum()
total_duplicates
0
data[duplicate_rows]
| school | gender | age | address | family_size | parent_cohabitation_status | mother_education | father_education | mother_job | father_job | ... | family_relationship_quality | free_time | go_out_with_friends | workday_alcohol_consumption | weekend_alcohol_consumption | health_status | absences | grade_period_1 | grade_period_2 | grade_period_3 |
|---|
0 rows × 33 columns
# looking for correlations
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = data[numeric_columns].corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr_matrix, annot = True, cmap = 'coolwarm', fmt = ".2f")
plt.show()
either positive or negative) between the numeric columns in the data set.summary = data.describe()
summary
| age | mother_education | father_education | travel_time | study_time | past_failures | family_relationship_quality | free_time | go_out_with_friends | workday_alcohol_consumption | weekend_alcohol_consumption | health_status | absences | grade_period_1 | grade_period_2 | grade_period_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 | 395.000000 |
| mean | 16.696203 | 2.749367 | 2.521519 | 1.448101 | 2.035443 | 0.334177 | 3.944304 | 3.235443 | 3.108861 | 1.481013 | 2.291139 | 3.554430 | 5.708861 | 10.908861 | 10.713924 | 10.415190 |
| std | 1.276043 | 1.094735 | 1.088201 | 0.697505 | 0.839240 | 0.743651 | 0.896659 | 0.998862 | 1.113278 | 0.890741 | 1.287897 | 1.390303 | 8.003096 | 3.319195 | 3.761505 | 4.581443 |
| min | 15.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 3.000000 | 0.000000 | 0.000000 |
| 25% | 16.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 0.000000 | 4.000000 | 3.000000 | 2.000000 | 1.000000 | 1.000000 | 3.000000 | 0.000000 | 8.000000 | 9.000000 | 8.000000 |
| 50% | 17.000000 | 3.000000 | 2.000000 | 1.000000 | 2.000000 | 0.000000 | 4.000000 | 3.000000 | 3.000000 | 1.000000 | 2.000000 | 4.000000 | 4.000000 | 11.000000 | 11.000000 | 11.000000 |
| 75% | 18.000000 | 4.000000 | 3.000000 | 2.000000 | 2.000000 | 0.000000 | 5.000000 | 4.000000 | 4.000000 | 2.000000 | 3.000000 | 5.000000 | 8.000000 | 13.000000 | 13.000000 | 14.000000 |
| max | 22.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 3.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 75.000000 | 19.000000 | 19.000000 | 20.000000 |
data.shape
(395, 33)
import matplotlib.pyplot as plt
import numpy as np
work_columns = ['mother_job', 'father_job']
working_parents = data[work_columns]
counts = working_parents.apply(lambda x: '_'.join(x), axis=1).value_counts()
counts
other_other 104 services_services 43 services_other 42 at_home_other 33 other_services 24 teacher_other 21 teacher_services 19 health_other 17 at_home_services 15 teacher_teacher 12 health_services 10 services_teacher 8 at_home_at_home 7 health_health 6 other_teacher 6 services_at_home 6 other_at_home 5 services_health 4 teacher_health 4 at_home_teacher 2 other_health 2 teacher_at_home 2 at_home_health 2 health_teacher 1 dtype: int64
working_parents
| mother_job | father_job | |
|---|---|---|
| 0 | at_home | teacher |
| 1 | at_home | other |
| 2 | at_home | other |
| 3 | health | services |
| 4 | other | other |
| ... | ... | ... |
| 390 | services | services |
| 391 | services | services |
| 392 | other | other |
| 393 | services | other |
| 394 | other | at_home |
395 rows × 2 columns
counts.sum()
395
no_parents_work = counts.get('at_home_at_home', 0)
one_parent_works = (
counts.get('at_home_other', 0) +
counts.get('at_home_services', 0) +
counts.get('at_home_teacher', 0) +
counts.get('at_home_health', 0) +
counts.get('other_at_home', 0) +
counts.get('services_at_home', 0) +
counts.get('teacher_at_home', 0) +
counts.get('health_at_home', 0)
)
total_entries = sum(counts.values)
two_parents_work = total_entries - (no_parents_work + one_parent_works)
print('no_parents_work:' ,no_parents_work)
print('one_parent_works:' ,one_parent_works)
print('two_parents_work:' ,two_parents_work)
no_parents_work: 7 one_parent_works: 65 two_parents_work: 323
import plotly.graph_objects as go
categories = ['Two Parents Work', 'One Parent Works', 'No Parents Work']
values = [two_parents_work, one_parent_works, no_parents_work, two_parents_work] # Repeat the first value
total_students = len(data)
proportions = [100 * value / total_students for value in values]
fig = go.Figure()
fig.add_trace(go.Scatterpolar(
r=proportions + proportions[:1] + ['%'],
theta=categories + [categories[0]],
fill='toself',
name='Proportion of Students',
))
fig.update_layout(
polar=dict(
radialaxis=dict(visible=False),
),
showlegend=False,
title=dict(text="Proportion of Students with Working Parents", x=0.5, y=0.9, xanchor='center', yanchor='top')
)
fig.show()
We can interpret from the triangle that
farthest or the length of the line is the lengthiest among all the three.We can conclude that most of the children parents are employed and only very few children have their parents who are un-employed which might or might not affect their grades. There are many factors at this point of time that have to be analyzed about the children whose parents aren't working
import seaborn as sns
import matplotlib.pyplot as plt
# Create a new column in the DataFrame to represent the parental work situation
data['parental_work_situation'] = working_parents.apply(
lambda x:
'Two Parents Work' if x['mother_job'] != 'at_home' and x['father_job'] != 'at_home'
else 'One Parent Works' if (x['mother_job'] != 'at_home' and x['father_job'] == 'at_home') or (x['mother_job'] == 'at_home' and x['father_job'] != 'at_home')
else 'No Parents Work' if x['mother_job'] == 'at_home' and x['father_job'] == 'at_home'
else 'Other', # Add an 'Other' category for unexpected cases
axis=1
)
# Define the order of categories
category_order = ['Two Parents Work', 'One Parent Works', 'No Parents Work']
# Plotting box plots
plt.figure(figsize=(12, 8))
sns.boxplot(x='parental_work_situation', y='grade_period_3', data=data, order=category_order)
plt.title('Relationship between Parental Work Situation and G3')
plt.show()
We can see that the median of the children with both the parents work is obviously higher that children with only single parent works.
But surprisingly the median of the children with both the parents donot work is very high compared to the other two. This might indicate the children are working hard whose parents don't have any employement and are achieving highest grades.
All three categories have the lowest possible outlier 0
import matplotlib.pyplot as plt
Family_support = data['family_support'].value_counts()
cohabitation_counts = data['parent_cohabitation_status'].value_counts()
# Giving custom labels for each category
cohabitation_labels = ['Living Together', 'Living Apart']
family_support_labels = [ 'Family Support', 'No Support',]
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.pie(cohabitation_counts, labels=cohabitation_labels, autopct='%1.1f%%', colors=['lightskyblue', 'Purple'])
plt.title('Parental Cohabitation Status')
plt.subplot(1, 2, 2)
plt.pie(Family_support, labels=family_support_labels, autopct='%1.1f%%', colors=['lightskyblue', 'Purple'])
plt.title('Family Support')
plt.tight_layout()
plt.show()
Figure depicts the distribution of students according to the “Parental Cohabitation Status” and “Family Support”. The first pie chart illustrates that the majority of the students have their parents living together which contributes to nearly 90% of the students, whereas only 10% of the students' parents are living apart. The second pie chart illustrates that 61% of the students are supported by their parents for their education and the rest of them are not supported.
import matplotlib.pyplot as plt
custom_palette = ['skyblue', 'salmon']
plt.figure(figsize=(12, 6))
for i, status in enumerate(data['parent_cohabitation_status'].unique()):
data_subset = data[data['parent_cohabitation_status'] == status]['grade_period_3']
plt.hist(data_subset, bins=20, alpha=0.5, label=f'Parent Cohabitation: {status}', color=custom_palette[i])
plt.title('Histogram of Math Scores by Parent Cohabitation Status')
plt.xlabel('Math Score (G3)')
plt.ylabel('Frequency')
plt.legend()
plt.show()
We can see from Figure that most of the parents are living together and hence the pink part of the visualization is huge compared to the blue part which is behind the pink ones. It can be seen that most of the students have scored the median mark which is 10 - 11. There are some students who got 0 in the final grade which is a limitation and has to be further analyzed.
import seaborn as sns
import matplotlib.pyplot as plt
custom_palette = ['skyblue', 'salmon']
plt.figure(figsize=(12, 6))
if (data['grade_period_3'] < 0).any():
raise ValueError("Negative values found in 'G3' column.")
for i, status in enumerate(data['parent_cohabitation_status'].unique()):
sns.kdeplot(data[data['parent_cohabitation_status'] == status]['grade_period_3'], label=f'Parent Cohabitation: {status}', shade=True, color=custom_palette[i], bw_adjust=1.0)
plt.title('Distribution of Math Scores by Parent Cohabitation Status')
plt.xlabel('Math Score (G3)')
plt.ylabel('Density')
plt.legend()
plt.show()
The main aim of the density plot is to visualize the mean and median of the particular column and also decide whether the values in that particular column are normally distributed or not. Looking into the graph, we can see that both of the categories have the mean almost equal and same. The shapes of the graphs are almost similar. The range of math scores is from 0 to 20 according to the data set. But due to flattening and smoothing of data it appears that the scores start from -5 and end at 25 which is only for the purpose of plotting the graph.
data['parent_cohabitation_status'].value_counts()
parent_cohabitation_status T 354 A 41 Name: count, dtype: int64
plt.figure(figsize=(10, 6))
sns.boxplot(x='family_relationship_quality', y='grade_period_3', data=data, palette='Blues')
plt.title('Relationship with Family and Math Score')
plt.xlabel('Family Relationship Quality')
plt.ylabel('Math Score (G3)')
plt.show()
Figure illustrates the distribution of math scores for each possible scale of Family Relationship Quality. The Family Relationship is measured on a scale of 1 to 5 where 1 being the lowest and 5 being the highest. We can observe that there is no specific relationship or trend that is being followed in the graph. At this point of time we can’t conclude anything without solid statistical evidence. The median of scores is decreasing from 1 to 3 whereas it started increasing from 3 to 5.
from scipy.stats import norm
def z_test(series1, series2):
# Calculate the mean and standard deviation of the two series
mean1 = series1.mean()
mean2 = series2.mean()
std1 = series1.std()
std2 = series2.std()
# Calculate the standard error of the difference in means
se = ((std1**2) / len(series1) + (std2**2) / len(series2))**0.5
# Calculate the z-statistic
z = (mean1 - mean2) / se
# Calculate the p-value
p_value = 2 * (1 - norm.cdf(abs(z)))
return z, p_value
from statsmodels.stats.weightstats import ztest
group1 = data[data['parent_cohabitation_status'] == 'T']['grade_period_3'] # Parents living together
group2 = data[data['parent_cohabitation_status'] == 'A']['grade_period_3'] # Parents living apart
statistic, p_value = ztest(group1, x2=group2, alternative='two-sided', usevar='pooled', value=abs(np.mean(group1) - np.mean(group2)))
mean_group1 = np.mean(group1)
mean_group2 = np.mean(group2)
std_dev_group1 = np.std(group1)
std_dev_group2 = np.std(group2)
# Calculating the confidence interval
ci_low, ci_high = (mean_group1 - mean_group2) - 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2)), \
(mean_group1 - mean_group2) + 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2))
# Printing results
print(f"P-value for Parent Cohabitation Status: {p_value}")
print(f"Confidence Interval: ({ci_low:.2f}, {ci_high:.2f})")
print(f"Mean (Parents living together: yes): {mean_group1:.2f}")
print(f"Mean (Parents living together: no): {mean_group2:.2f}")
print(f"Standard Deviation (Parents living together: yes): {std_dev_group1:.2f}")
print(f"Standard Deviation (Parents living together: no): {std_dev_group2:.2f}")
P-value for Parent Cohabitation Status: 0.021231279886913746 Confidence Interval: (-2.25, 0.51) Mean (Parents living together: yes): 10.32 Mean (Parents living together: no): 11.20 Standard Deviation (Parents living together: yes): 4.60 Standard Deviation (Parents living together: no): 4.24
from statsmodels.stats.weightstats import ztest
df_copy = data[data['age'] < 18]
group1 = df_copy[df_copy['parent_cohabitation_status'] == 'T']['grade_period_3'] # Parents living together
group2 = df_copy[df_copy['parent_cohabitation_status'] == 'A']['grade_period_3'] # Parents living apart
statistic, p_value = ztest(group1, x2=group2, alternative='two-sided', usevar='pooled', value=abs(np.mean(group1) - np.mean(group2)))
mean_group1 = np.mean(group1)
mean_group2 = np.mean(group2)
std_dev_group1 = np.std(group1)
std_dev_group2 = np.std(group2)
# Calculate the confidence interval
ci_low, ci_high = (mean_group1 - mean_group2) - 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2)), \
(mean_group1 - mean_group2) + 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2))
# Print results
print(f"P-value for Parent Cohabitation Status: {p_value}")
print(f"Confidence Interval: ({ci_low:.2f}, {ci_high:.2f})")
print(f"Mean (Parents living together: yes): {mean_group1:.2f}")
print(f"Mean (Parents living together: no): {mean_group2:.2f}")
print(f"Standard Deviation (Parents living together: yes): {std_dev_group1:.2f}")
print(f"Standard Deviation (Parents living together: no): {std_dev_group2:.2f}")
P-value for Parent Cohabitation Status: 0.9219454729056976 Confidence Interval: (-1.68, 1.59) Mean (Parents living together: yes): 10.83 Mean (Parents living together: no): 10.87 Standard Deviation (Parents living together: yes): 4.37 Standard Deviation (Parents living together: no): 4.38
## Same test for Adults (Parents living together vs. Parents living apart)
from statsmodels.stats.weightstats import ztest
df_copy = data[data['age'] > 18]
group1 = df_copy[df_copy['parent_cohabitation_status'] == 'T']['grade_period_3'] # Parents living together
group2 = df_copy[df_copy['parent_cohabitation_status'] == 'A']['grade_period_3'] # Parents living apart
statistic, p_value = ztest(group1, x2=group2, alternative='two-sided', usevar='pooled', value=abs(np.mean(group1) - np.mean(group2)))
mean_group1 = np.mean(group1)
mean_group2 = np.mean(group2)
std_dev_group1 = np.std(group1)
std_dev_group2 = np.std(group2)
# Calculate the confidence interval
ci_low, ci_high = (mean_group1 - mean_group2) - 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2)), \
(mean_group1 - mean_group2) + 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2))
# Print results
print(f"P-value for Parent Cohabitation Status: {p_value}")
print(f"Confidence Interval: ({ci_low:.2f}, {ci_high:.2f})")
print(f"Mean (Parents living together: yes): {mean_group1:.2f}")
print(f"Mean (Parents living together: no): {mean_group2:.2f}")
print(f"Standard Deviation (Parents living together: yes): {std_dev_group1:.2f}")
print(f"Standard Deviation (Parents living together: no): {std_dev_group2:.2f}")
P-value for Parent Cohabitation Status: 0.00013784392530664805 Confidence Interval: (-8.35, -0.91) Mean (Parents living together: yes): 8.12 Mean (Parents living together: no): 12.75 Standard Deviation (Parents living together: yes): 4.49 Standard Deviation (Parents living together: no): 3.34
from statsmodels.stats.weightstats import ztest
group1 = data[data['family_support'] == 'yes']['grade_period_3'] # Family support: yes
group2 = data[data['family_support'] == 'no']['grade_period_3'] # Family support: no
statistic, p_value = ztest(group1, x2=group2, alternative='two-sided', usevar='pooled', value=abs(np.mean(group1) - np.mean(group2)))
mean_group1 = np.mean(group1)
mean_group2 = np.mean(group2)
std_dev_group1 = np.std(group1)
std_dev_group2 = np.std(group2)
# Calculating the confidence interval
ci_low, ci_high = (mean_group1 - mean_group2) - 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2)), \
(mean_group1 - mean_group2) + 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2))
# Printing results
print(f"P-value for Family Support: {p_value}")
print(f"Confidence Interval: ({ci_low:.2f}, {ci_high:.2f})")
print(f"Mean (Family Support: yes): {mean_group1:.2f}")
print(f"Mean (Family Support: no): {mean_group2:.2f}")
print(f"Standard Deviation (Family Support: yes): {std_dev_group1:.2f}")
print(f"Standard Deviation (Family Support: no): {std_dev_group2:.2f}")
P-value for Family Support: 0.12025314965674688 Confidence Interval: (-1.30, 0.56) Mean (Family Support: yes): 10.27 Mean (Family Support: no): 10.64 Standard Deviation (Family Support: yes): 4.54 Standard Deviation (Family Support: no): 4.62
from statsmodels.stats.weightstats import ztest
group1 = data[data['family_relationship_quality']<4]['grade_period_3'] # Not good Family relationship
group2 = data[data['family_relationship_quality']>=4]['grade_period_3'] # Good Family relationship
# Perform the two-tailed z-test
statistic, p_value = ztest(group1, x2=group2, alternative='two-sided', usevar='pooled', value=abs(np.mean(group1) - np.mean(group2)))
mean_group1 = np.mean(group1)
mean_group2 = np.mean(group2)
std_dev_group1 = np.std(group1)
std_dev_group2 = np.std(group2)
# Calculate the confidence interval
ci_low, ci_high = (mean_group1 - mean_group2) - 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2)), \
(mean_group1 - mean_group2) + 1.96 * np.sqrt(std_dev_group1**2/len(group1) + std_dev_group2**2/len(group2))
# Print results
print(f"P-value for Family Relationship status: {p_value}")
print(f"Confidence Interval: ({ci_low:.2f}, {ci_high:.2f})")
print(f"Mean (Bad Family Relationship): {mean_group1:.2f}")
print(f"Mean (Good Family Relationship): {mean_group2:.2f}")
print(f"Standard Deviation (Bad Family Relationship): {std_dev_group1:.2f}")
print(f"Standard Deviation (Good Family Relationship): {std_dev_group2:.2f}")
P-value for Family Relationship status: 0.08857309537975457 Confidence Interval: (-1.55, 0.63) Mean (Bad Family Relationship): 10.06 Mean (Good Family Relationship): 10.52 Standard Deviation (Bad Family Relationship): 4.77 Standard Deviation (Good Family Relationship): 4.51